Have you ever wanted predict outcomes of NBA games in real time as the games are occurring? This webpage is part 1 or a 2 part series that will describe the methods you can use to build your own prediction model. We will leverage IBM’s Data Science Experience environment with Rstudio to build linear and logistic regression models using R and Spark.
datascience.ibm.com
As a final step, we deploy an app in IBM’s Bluemix using NodeJS. The site is live and lets you interact with the model that was built using the analysis from the R.
optional caption text
This site holds the live hosted website running the models from the analysis http://169.55.24.28:6001/
All the source for this demo including the HOWTO is located on GitHub. https://github.com/dustinvanstee/nba-rt-prediction
Lets get started!
Note: run install.packages() before loading the packages.
# install.packages("dplyr") # general data manipulation: summarise, filter, etc.
# install.packages("plyr")
# install.packages("chron") # manipulation of date/time data
# install.packages("scatterD3") # interactive plots
# install.packages("plotly")
# install.packages("RCurl")
library(RCurl)
library(dplyr)
library(plyr)
library(chron)
library(scatterD3)
library(plotly)
#Curl data from Github
nba_scores_lines <- readLines(textConnection(getURL("https://raw.githubusercontent.com/dustinvanstee/nba-rt-prediction/master/scores_nba.test.dat")))
# Split CSV line array into tokens, and load them into dataframe
nba_scores_DF <- as.data.frame(do.call(rbind, strsplit(nba_scores_lines, ",")), stringsAsFactors=FALSE)
## Warning in (function (..., deparse.level = 1) : number of columns of result is not a multiple of vector length (arg 14628)
# Since there isnt header in the data set, specify the column metadata
colnames(nba_scores_DF) <- c("dateOrig","ts","teamlonga", "scorea", "teamlongb", "scoreb", "timestring", "timeleft", "gameid")
# Apply Types to the data
nba_scores_DF2 <- transform(nba_scores_DF,
dateOrig = as.Date(dateOrig),
ts = as.character(ts),
teamlonga = as.character(teamlonga),
scorea = as.numeric (scorea),
teamlongb = as.character(teamlongb),
scoreb = as.numeric (scoreb),
timestring = as.character(timestring),
timeleft = as.numeric(timeleft),
gameid = as.character(gameid))
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
This data is the raw input that contains a record for each update of the game. Each game has approximately ~120 data points from start to finish. Games were sampled on 1 minute intervals. The data has some errors and redundancies that will be removed. The first step needed is to seperate the in game scores and the final scores. The final score outcome will end up being the value that will try to be predicted , and therefore must be appended to every in game score.
# NAs (not available) are introduced because the raw data has invalid data points, so remove these observations
rtscoresAndFinalDF <- na.omit(nba_scores_DF2)
# Print the dimensions of the data. Rows are the number of individual score data points.
dim(rtscoresAndFinalDF) #16746 9
## [1] 16746 9
# Take a look at the first few rows of the dataframe
head(rtscoresAndFinalDF)
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid
## 1 2016-04-05 15:06:16 Phoenix 0 Atlanta 0 (8:00 PM ET) 48 400829044
## 2 2016-04-05 15:06:16 Chicago 0 Memphis 0 (8:00 PM ET) 48 400829045
## 3 2016-04-05 15:06:16 Cleveland 0 Milwaukee 0 (8:00 PM ET) 48 400829046
## 4 2016-04-05 15:06:16 Oklahoma City 0 Denver 0 (9:00 PM ET) 48 400829047
## 5 2016-04-05 15:06:16 New Orleans 0 Philadelphia 0 (7:00 PM ET) 48 400829041
## 6 2016-04-05 15:06:16 Detroit 0 Miami 0 (8:00 PM ET) 48 400829042
# Final Scores
head(filter(rtscoresAndFinalDF, grepl("FINAL", timestring)))
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid
## 1 2016-04-05 21:22:09 New Orleans 93 Philadelphia 107 (FINAL) 0 400829041
## 2 2016-04-05 22:08:42 Charlotte 90 Toronto 96 (FINAL) 0 400829043
## 3 2016-04-05 22:25:25 Chicago 92 Memphis 108 (FINAL) 0 400829045
## 4 2016-04-05 22:28:58 Phoenix 90 Atlanta 103 (FINAL) 0 400829044
## 5 2016-04-05 22:30:29 Cleveland 109 Milwaukee 80 (FINAL) 0 400829046
## 6 2016-04-05 22:30:29 Detroit 89 Miami 107 (FINAL) 0 400829042
# Scores from 1st quarter
head(filter(rtscoresAndFinalDF, grepl("1ST", timestring)))
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid
## 1 2016-04-05 19:23:42 New Orleans 23 Philadelphia 12 (4:39 IN 1ST) 40.65000 400829041
## 2 2016-04-05 19:23:57 New Orleans 23 Philadelphia 14 (4:05 IN 1ST) 40.08333 400829041
## 3 2016-04-05 19:24:13 New Orleans 23 Philadelphia 14 (3:41 IN 1ST) 39.68333 400829041
## 4 2016-04-05 19:24:28 New Orleans 23 Philadelphia 14 (3:32 IN 1ST) 39.53333 400829041
## 5 2016-04-05 19:24:43 New Orleans 23 Philadelphia 16 (3:24 IN 1ST) 39.40000 400829041
## 6 2016-04-05 19:25:29 New Orleans 23 Philadelphia 16 (3:11 IN 1ST) 39.18333 400829041
The odds data and score data files had a different naming conventions for the teams. This function will be used to map all long team names into 3 letter acronym
# Function to turn long team name to short
teamMap <- function(x) {
tnames <- data.frame(
long = as.factor(c("Atlanta", "Boston", "Brooklyn", "Charlotte", "Chicago",
"Cleveland", "Dallas", "Denver", "Detroit", "Golden State",
"Houston","Indiana", "LA Clippers", "LA Lakers", "Memphis",
"Miami", "Milwaukee", "Minnesota", "New Orleans", "New York",
"Oklahoma City", "Orlando", "Philadelphia", "Phila.", "Phoenix",
"Portland", "Sacramento", "San Antonio", "Toronto", "Utah", "Washington")),
short = as.factor(c("atl", "bos", "bkn", "cha", "chi",
"cle", "dal", "den", "det", "gst",
"hou", "ind", "lac", "lal", "mem",
"mia", "mil", "min", "nor", "nyk",
"okc", "orl", "phi", "phi", "pho",
"por", "sac", "san", "tor", "uta", "wsh"))
)
df_x <- data.frame(long=x)
short <- tnames$short[match(df_x$long, tnames$long)]
return(short)
}
# Function to convert 3-character month to 2-digit numeric month
monthMap <-function(x) {
a <-data.frame(
str = as.factor(c("Jan", "Feb", "Mar", "Apr", "May",
"Jun", "Jul", "Aug", "Sep", "Oct",
"Nov", "Dec")),
num = as.factor(c("01", "02", "03", "04", "05",
"06", "07", "08", "09", "10",
"11", "12"))
)
df_x <- data.frame(str=x)
num <- a$num[match(df_x$str, a$str)]
return(num)
}
# Unique key for each game consists of date, home team, away team. For games that span multiple days due to
# continuing through midnight, date logic is required to adjust some of the score data.
# Inputs : input date, timestamp
# Retuns : adjusted date
# If time is midnight -> 3am EST, then adjust
dateadjustudf <- function(datein, tsin){
newdate <- c()
for (i in 1:length(tsin)){
if (grepl("^0[0-3]", tsin[i])) {
newdate[i] = datein[i] - 1
} else {
newdate[i] = datein[i]
}
}
return(newdate)
}
Remove overtime, add keys for joins, and perform date transformations
# Remove Overtime games from this analysis
rtscoresAndFinalDF <- filter(rtscoresAndFinalDF, !grepl(".*OT.*", timestring))
#16626
# Create short 3 character team names
rtscoresAndFinalDF$teama <- teamMap(rtscoresAndFinalDF$teamlonga)
rtscoresAndFinalDF$teamb <- teamMap(rtscoresAndFinalDF$teamlongb)
# Add a score differential Column
rtscoresAndFinalDF$scorea_scoreb <- rtscoresAndFinalDF$scorea - rtscoresAndFinalDF$scoreb
# Transform the Date. This is for games that spanned multiple days.
# Games adjusted to the day they started on.
rtscoresAndFinalDF$date <- dateadjustudf(rtscoresAndFinalDF$dateOrig, rtscoresAndFinalDF$ts)
rtscoresAndFinalDF$date <- as.Date(rtscoresAndFinalDF$date, origin = "1970-01-01")
# Create a key to join with odds data later. Key = date.teama.teamb
for (i in 1:nrow(rtscoresAndFinalDF)){
rtscoresAndFinalDF$key[i] <- paste0(rtscoresAndFinalDF$date[i], ".", rtscoresAndFinalDF$teama[i], ".", rtscoresAndFinalDF$teamb[i])
}
#rtscoresAndFinalDF$key2 <- paste(rtscoresAndFinalDF$date, rtscoresAndFinalDF$teama, rtscoresAndFinalDF$teamb, sep=".")
Based on the way the data was sampled, both i ngame scores and final scores are written as seperate records to the same file. For building predictive models, each in game score needs to have the final score appended to it. After the data is seperated, a few extra features will be added to the in game scores, and then the in game and final scores will be joined.
# Create Final Score DF
# filter out any score that has FINAL
finalscoresDF <- filter(rtscoresAndFinalDF, grepl("FINAL", timestring))
# Rename some columns so that join later doesnt have name overlaps
finalscoresDF$fscorea <- finalscoresDF$scorea
finalscoresDF$fscoreb <- finalscoresDF$scoreb
# Create final score difference
finalscoresDF$fscorea_fscoreb <- finalscoresDF$fscorea - finalscoresDF$fscoreb
finalscoresDF$fscoreb_fscorea <- finalscoresDF$fscoreb - finalscoresDF$fscorea
# Add a Win/loss column Win = 1, Loss = 0
for (i in 1 : nrow(finalscoresDF)){
if (finalscoresDF$fscorea_fscoreb[i] > 0){
finalscoresDF$home_win[i] <- 0
finalscoresDF$away_win[i] <- 1
} else {
finalscoresDF$home_win[i] <- 1
finalscoresDF$away_win[i] <- 0
}
}
#################################################################################################################
# Create In Game score DF and remove some problematic data points.
# Remove halftime records and these other cases as datasource doesnt always update the quarter change well
rtscoresDF <- filter(rtscoresAndFinalDF, !grepl('HALF', timestring), !grepl('FINAL', timestring),
timestring != "(12:00 IN 1ST)" ,
timestring != "(12:00 IN 2ND)" ,
timestring != "(12:00 IN 3RD)" ,
timestring != "(12:00 IN 4TH)" ,
timestring != "(END OF 1ST)" ,
timestring != "(END OF 2ND)" ,
timestring != "(END OF 3RD)" ,
timestring != "(END OF 4TH)" )
# Create in game score difference
rtscoresDF$scorea_scoreb <- rtscoresDF$scorea - rtscoresDF$scoreb
rtscoresDF$scoreb_scorea <- rtscoresDF$scoreb - rtscoresDF$scorea
# Create a game PCT complete and PCT left indictor
rtscoresDF$pct_complete <- (((rtscoresDF$timeleft * -1) + 48 )/48.0)*100
rtscoresDF$pct_left <- 100 - rtscoresDF$pct_complete
# Create some custom features that weight score difference more as the game comes near to finish
# These features were added as initial models did not fit the end of game well.
rtscoresDF$cf1 <- (1/((rtscoresDF$pct_left/25 + .01)^.5)) * rtscoresDF$scoreb_scorea
rtscoresDF$cf2 <- (1/((rtscoresDF$pct_left/2.0 + .01)^1.3))*rtscoresDF$scoreb_scorea
After building the initial model without custom features, the logistic model was not adjusting the probabilities well at the end of the games. There some examples when there was 0 minutes left in the game, and yet the logistic model was giving a 70% chance of victory for a team. This was due to the fact that the original features were not fitting the end of game very well. To fix this, a custom feature was added that takes the score difference and amplifies it as the score nears the end of the game. This feature dominates at the end of games and helps fit the data at the end of games.
# subset a dataframe for scatterplot
# spreader <- filter(rtscoresDF, pct_complete < 95)
# draw interactive scatter plot
scatterD3(x = rtscoresDF$pct_complete, y = rtscoresDF$scoreb_scorea, col_var = rtscoresDF$key, xlab = "% of Game Complete", ylab = "score difference", xlim = c(0,100), point_size = 10)
scatterD3(x = rtscoresDF$pct_complete, y = rtscoresDF$cf1, col_var = rtscoresDF$key, xlab = "% of Game Complete", ylab = "score difference amplified", xlim = c(0,100), ylim = c(-20,20), point_size = 10)
***
## [1] "Final scores data frame"
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid teama teamb scorea_scoreb date key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea home_win away_win
## 1 2016-04-05 21:22:09 New Orleans 93 Philadelphia 107 (FINAL) 0 400829041 nor phi -14 2016-04-05 2016-04-05.nor.phi 93 107 -14 14 1 0
## 2 2016-04-05 22:08:42 Charlotte 90 Toronto 96 (FINAL) 0 400829043 cha tor -6 2016-04-05 2016-04-05.cha.tor 90 96 -6 6 1 0
## 3 2016-04-05 22:25:25 Chicago 92 Memphis 108 (FINAL) 0 400829045 chi mem -16 2016-04-05 2016-04-05.chi.mem 92 108 -16 16 1 0
## 4 2016-04-05 22:28:58 Phoenix 90 Atlanta 103 (FINAL) 0 400829044 pho atl -13 2016-04-05 2016-04-05.pho.atl 90 103 -13 13 1 0
## 5 2016-04-05 22:30:29 Cleveland 109 Milwaukee 80 (FINAL) 0 400829046 cle mil 29 2016-04-05 2016-04-05.cle.mil 109 80 29 -29 0 1
## 6 2016-04-05 22:30:29 Detroit 89 Miami 107 (FINAL) 0 400829042 det mia -18 2016-04-05 2016-04-05.det.mia 89 107 -18 18 1 0
## [1] "Total Games = 116"
## [1] "In game scores data frame"
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid teama teamb scorea_scoreb date key scoreb_scorea pct_complete pct_left cf1 cf2
## 1 2016-04-05 15:06:16 Phoenix 0 Atlanta 0 (8:00 PM ET) 48 400829044 pho atl 0 2016-04-05 2016-04-05.pho.atl 0 0 100 0 0
## 2 2016-04-05 15:06:16 Chicago 0 Memphis 0 (8:00 PM ET) 48 400829045 chi mem 0 2016-04-05 2016-04-05.chi.mem 0 0 100 0 0
## 3 2016-04-05 15:06:16 Cleveland 0 Milwaukee 0 (8:00 PM ET) 48 400829046 cle mil 0 2016-04-05 2016-04-05.cle.mil 0 0 100 0 0
## 4 2016-04-05 15:06:16 Oklahoma City 0 Denver 0 (9:00 PM ET) 48 400829047 okc den 0 2016-04-05 2016-04-05.okc.den 0 0 100 0 0
## 5 2016-04-05 15:06:16 New Orleans 0 Philadelphia 0 (7:00 PM ET) 48 400829041 nor phi 0 2016-04-05 2016-04-05.nor.phi 0 0 100 0 0
## 6 2016-04-05 15:06:16 Detroit 0 Miami 0 (8:00 PM ET) 48 400829042 det mia 0 2016-04-05 2016-04-05.det.mia 0 0 100 0 0
## [1] "Total Number of rt score records = 15947"
How to Interpret the Raw Odds data
Example Golden State -12.5 O (207.0) -125.0 | Detroit 12.5 U (207.0) 145.0
The away team is listed first, and the home team is second
Here Golden State is a 12.5 pt favorite to win. The over under is in parentheses (207) and is the 50/50 line between teams sum of scores
being above/below that line.
Finally the -125 / +145 numbers are whats known at the moneyline odds.
A negative number means you need to bet 125$ to get a 100$ payout
A positive number means you need to bet 100$ to get a 145$ payout
xml <- readLines(textConnection(getURL("https://raw.githubusercontent.com/dustinvanstee/nba-rt-prediction/master/nbaodds_042516.xml")))
# use regular expression to catch info we need
odds <- lapply(xml, function(x) substr(x, regexpr(">", x) + 1, regexpr("/", x) - 2))
odds_split <- lapply(odds, function(x) unlist(strsplit(x, " ")))
# get teamlonga
teamlonga_0 <- lapply(odds_split, function(x) paste(x[1], x[2]))
teamlonga <- lapply(teamlonga_0, function(x){
if (regexpr("[0-9|-]", x) > -1) {
substr(x, 1, regexpr("[0-9|-]", x)-2)
} else{
x
}
})
# get teamlongb
teamlongb_0 <- lapply(odds_split, function(x) paste(x[7],x[8], x[9]))
teamlongb_1 <- lapply(teamlongb_0, function(x){
if (regexpr("[0-9]", x) > -1) {
substr(x, regexpr("[A-Za-z]", x), regexpr("[0-9-]", x)-2)
} else{
x
}
})
teamlongb <- lapply(teamlongb_1, function(x){
if (regexpr("|", x) > -1){
substr(x, regexpr("[A-Za-z]", x), nchar(x))
} else {
x
}
})
# teamaspread
teamaspread_0 <- lapply(odds, function(x){
substr(x, regexpr("[0-9-]",x), regexpr("[0-9-]",x)+4)
})
teamaspread <- lapply(teamaspread_0, function(x){
if (regexpr("[ ]", x) > 0){
substr(x, 1, regexpr("[ ]", x)-1)
} else {
x
}
})
# overunder
overunder <- lapply(odds, function(x){
substr(x, regexpr("[(]", x) + 1, regexpr("[)]", x) - 1)
})
# teamaml
teamaml <- lapply(odds, function(x){
substr(x,regexpr("[)]", x) + 2, regexpr("[|]", x) - 2 )
})
# teambml
teambml <- lapply(odds, function(x){
substr(x, gregexpr("[)]", x)[[1]][2]+2, gregexpr("[(]", x)[[1]][3]-2)
})
#get date
dateStr <- lapply(odds, function(x){
month <- substr(x, gregexpr("[(]", x)[[1]][3]+1, gregexpr("[(]", x)[[1]][3]+3)
month_num <- monthMap(month)
date <- substr(x, gregexpr("[(]", x)[[1]][3]+5, gregexpr("[(]", x)[[1]][3]+6)
year <- substr(x, gregexpr("[(]", x)[[1]][3]+9, gregexpr("[(]", x)[[1]][3]+12)
paste0(year, "-", month_num, "-", date)
})
# get short team names
teama <- lapply(teamlonga, teamMap)
teamb <- lapply(teamlongb, teamMap)
# bind all column together into dataframe
oddsDF <- na.omit(do.call(rbind, Map(data.frame, teamlonga=teamlonga, teama=teama, teamlongb=teamlongb, teamb=teamb, teamaspread=teamaspread, overunder=overunder, teamaml=teamaml, teambml=teambml, dateStr=dateStr)))
# change to right data type and create a key for join later
oddsDF$teamaspread <- as.numeric(as.character(oddsDF$teamaspread))
oddsDF$overunder <- as.numeric(as.character(oddsDF$overunder))
oddsDF$teamaml <- as.numeric(as.character(oddsDF$teamaml))
oddsDF$teambml <- as.numeric(as.character(oddsDF$teambml))
oddsDF$teama <- as.character(oddsDF$teama)
oddsDF$teamb <- as.character(oddsDF$teamb)
oddsDF$key <- paste0(oddsDF$dateStr, ".", oddsDF$teama, ".", oddsDF$teamb)
# Print the Dimensions of the data. Currently collected 161 games
dim(oddsDF) #161 10
## [1] 161 10
# add the groupby and average below because some games had odds over multiple days, and it was adding noise to the analysis
oddsDF2 <- ddply(oddsDF, c("key", "teamlonga", "teamlongb", "teama", "teamb", "dateStr"), summarise,
teamaspread = mean(teamaspread),
overunder = mean(overunder),
teamaml = mean(teamaml),
teambml = mean(teambml))
# Create a few new columns for later analysis
oddsDF2$teambspread <- oddsDF2$teamaspread * -1
oddsDF2$teama_vegas_fscore <- (oddsDF2$overunder / 2.0) - (oddsDF2$teamaspread / 2.0)
oddsDF2$teamb_vegas_fscore <- (oddsDF2$overunder / 2.0) + (oddsDF2$teamaspread / 2.0)
head(oddsDF2)
## key teamlonga teamlongb teama teamb dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore
## 1 2016-04-05.cha.tor Charlotte Toronto cha tor 2016-04-05 4.0 200.5 155 -175 -4.0 98.25 102.25
## 2 2016-04-05.chi.mem Chicago Memphis chi mem 2016-04-05 -3.0 201.5 -150 130 3.0 102.25 99.25
## 3 2016-04-05.cle.mil Cleveland Milwaukee cle mil 2016-04-05 -7.5 203.0 -340 280 7.5 105.25 97.75
## 4 2016-04-05.det.mia Detroit Miami det mia 2016-04-05 4.0 202.0 160 -190 -4.0 99.00 103.00
## 5 2016-04-05.lal.lac LA Lakers LA Clippers lal lac 2016-04-05 14.5 208.0 -110 -110 -14.5 96.75 111.25
## 6 2016-04-05.min.gst Minnesota Golden State min gst 2016-04-05 15.5 225.0 -110 -110 -15.5 104.75 120.25
paste("total home teams = ", length(unique(oddsDF2$teama)))
## [1] "total home teams = 30"
paste("total away teams = ", length(unique(oddsDF2$teamb)))
## [1] "total away teams = 30"
paste("total games collected = ", nrow(oddsDF2))
## [1] "total games collected = 111"
Here we are averaging the away spread per team. If the bar is above the zero line, then the team is an underdog, and under the line the team is the favorite. 8 of the 32 teams were favorites on the road including Golden State and Cleveland…
# visualize away spread data
avg_away_spread <- ddply(oddsDF2, c("teamlonga", "teamlongb"), summarise,
awayspread_avg_teamaspread = mean(teamaspread),
awayspread_avg_teambspread = mean(teambspread))
# away spread group by teama
away_spread_teama <- ddply(avg_away_spread, c("teamlonga"), summarise,
teamaspread = mean(awayspread_avg_teamaspread))
# order by teama
away_spread_teama$teamlonga <- as.character(away_spread_teama$teamlonga)
away_spread_teama <- away_spread_teama[order(away_spread_teama$teamlonga), ]
# barchart
p <- plot_ly(
x = away_spread_teama$teamlonga,
y = away_spread_teama$teamaspread,
type = "bar")
p
Here we are averaging the home spread per team. If the bar is above the zero line, then the team is an underdog, and under the line the team is the favorite. Note here that the home teams are favored much more, with the usual suspects having a very large advantage (SAN/GST/OKC)
# spread group by teamb
away_spread_teamb <- ddply(avg_away_spread, c("teamlongb"), summarise,
teambspread = mean(awayspread_avg_teambspread))
# order by teamb
away_spread_teamb$teamlongb <- as.character(away_spread_teamb$teamlongb)
away_spread_teamb <- away_spread_teamb[order(away_spread_teamb$teamlongb), ]
p <- plot_ly(
x = away_spread_teamb$teamlongb,
y = away_spread_teamb$teambspread,
type = "bar")
p
# Here is where the Odds/In Games scores/ Final Scores are joined into one wholistic data set as input for Logistic/Linear regression
# Create a smaller Final Score Dataframe and prune away some columns. Just keep the key, final score a and b, the win/loss indicator
finalslicedscoresDF <- finalscoresDF[c("key","fscorea", "fscoreb", "fscorea_fscoreb", "fscoreb_fscorea", "away_win", "home_win")]
# First Join the 2 smallest data frames ... odd and final.
gameDF <- merge(finalslicedscoresDF, oddsDF2, by = "key")
gameDF$teamlonga <- NULL
gameDF$teamlongb <- NULL
gameDF$teama <- NULL
gameDF$teamb <- NULL
# Print Out the Game Dataframe ... notice we have the odds data merged with the win loss data ....
print("gameDF")
## [1] "gameDF"
head(gameDF)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore
## 1 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4.0 200.5 155 -175 -4.0 98.25 102.25
## 2 2016-04-05.chi.mem 92 108 -16 16 0 1 2016-04-05 -3.0 201.5 -150 130 3.0 102.25 99.25
## 3 2016-04-05.cle.mil 109 80 29 -29 1 0 2016-04-05 -7.5 203.0 -340 280 7.5 105.25 97.75
## 4 2016-04-05.det.mia 89 107 -18 18 0 1 2016-04-05 4.0 202.0 160 -190 -4.0 99.00 103.00
## 5 2016-04-05.lal.lac 81 103 -22 22 0 1 2016-04-05 14.5 208.0 -110 -110 -14.5 96.75 111.25
## 6 2016-04-05.nor.phi 93 107 -14 14 0 1 2016-04-05 2.5 207.0 125 -145 -2.5 102.25 104.75
paste("total games collected:", nrow(gameDF)) #103
## [1] "total games collected: 103"
# Here we show that the better a team is (negative spread, the more they are likely to win ...)
#Here the spread at the start of the game is a decent predictor regarding the end result
# Final Score Difference vs Spread
# Top Left indicates teams with a large pos spread will lose by a wider margin
# the line should approx pass through 0,0
# lower Right indicates teams with large neg spread will win by a wider margin
# The logistic and linear models we build will quantify this for us later!
scatterD3(x = gameDF$fscoreb_fscorea, y = gameDF$teamaspread)
# Here we can show another weak correlation of the vegas overunder/spread to the actual final outcome.
# vegas_fscore was calculated by taking overunder/2 +- the spread/2 to get a projection of
# the home/away teams score
# Here if the prediction and data were perfectly correlated, we would pass through the
# y=x line. in general we follow that path
# we will see how this term plays when we dig into the linear model
# here only home team is shown, but same trend holds for away team
# Home
scatterD3(x = gameDF$teamb_vegas_fscore, y = gameDF$fscoreb);
# Away
scatterD3(x = gameDF$teama_vegas_fscore, y = gameDF$fscorea)
# This is the bigger merge. Merging the odds/final score data with the in game indicators ...
lrDF <- merge(gameDF, rtscoresDF, by = "key")
print("lrDF : Logistic Regression Data Frame")
## [1] "lrDF : Logistic Regression Data Frame"
head(lrDF)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid teama teamb scorea_scoreb date scoreb_scorea pct_complete pct_left cf1 cf2
## 1 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:32:49 Charlotte 42 Toronto 52 (1:12 IN 2ND) 25.20000 400829043 cha tor -10 2016-04-05 10 47.500000 52.50000 6.884284 0.14286204
## 2 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:56:38 Charlotte 44 Toronto 58 (8:56 IN 3RD) 20.93333 400829043 cha tor -14 2016-04-05 14 56.388889 43.61111 10.569592 0.25452584
## 3 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:08:41 Charlotte 19 Toronto 28 (11:08 IN 2ND) 35.13333 400829043 cha tor -9 2016-04-05 9 26.805556 73.19444 5.250891 0.08348447
## 4 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:26:10 Charlotte 35 Toronto 41 (4:11 IN 2ND) 28.18333 400829043 cha tor -6 2016-04-05 6 41.284722 58.71528 3.906817 0.07411763
## 5 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:32:34 Charlotte 40 Toronto 52 (1:30 IN 2ND) 25.50000 400829043 cha tor -12 2016-04-05 12 46.875000 53.12500 8.212631 0.16881814
## 6 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 19:49:35 Charlotte 9 Toronto 12 (7:18 IN 1ST) 43.30000 400829043 cha tor -3 2016-04-05 3 9.791667 90.20833 1.577128 0.02120872
paste("total data points collected:", nrow(lrDF)) #13412
## [1] "total data points collected: 13412"
# Add an overunder/spread adjusted projection as points are scored during the game
# I found this is a strong indicator
lrDF$teama_adj_fscore <- ((lrDF$pct_complete * -1)/100 + 1) * lrDF$teama_vegas_fscore + lrDF$scorea
lrDF$teamb_adj_fscore <- ((lrDF$pct_complete * -1)/100 + 1) * lrDF$teamb_vegas_fscore + lrDF$scoreb
lrDF$pfscoreb_pfscorea <- lrDF$teamb_adj_fscore - lrDF$teama_adj_fscore
# There is an issue with the data I had captured. When a quarter transitions from 1st->2nd (etc,etc), sometime the timestring doesn't get updated properly. Since I used the timestring to calculate the timeleft in the game, I would get some rogue data points.
# Example, after 1 min in a game, something the two teams would have scores in the 20's, because it was really at 11 mins in the second quarter.
# My solution was to use the final score sum, and then just scale that down to the time left in the game. I would then compare to the sum of scores i had, and if it was significantly higher, I would remove them. I did this by visual inspection ...
# dfa = departure_from_avg
lrDF$dfa <- (lrDF$fscorea + lrDF$fscoreb)/48 * (lrDF$timeleft * -1 + 48) - (lrDF$scorea + lrDF$scoreb)
lrDF_filtered <- filter(lrDF, dfa > -30)
summary(lrDF_filtered)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid teama teamb scorea_scoreb date scoreb_scorea pct_complete pct_left cf1 cf2 teama_adj_fscore teamb_adj_fscore pfscoreb_pfscorea dfa
## Length:13217 Min. : 68.00 Min. : 80.0 Min. :-38.00 Min. :-29.00 Min. :0.000 Min. :0.000 2016-04-13:1829 Min. :-13.000 Min. :180.5 Min. :-553.33 Min. :-750.0 Min. :-19.000 Min. : 84.38 Min. : 84.92 Min. :2016-04-05 Length:13217 Length:13217 Min. : 0.00 Length:13217 Min. : 0.00 Length:13217 Min. : 0.00 Length:13217 cha : 796 bos : 889 Min. :-44.000 Min. :2016-04-05 Min. :-33.000 Min. : 0.00 Min. : 0.00 Min. :-290.000 Min. :-11545.108 Min. : 66.42 Min. : 75.50 Min. :-34.915 Min. :-26.7865
## Class :character 1st Qu.: 92.00 1st Qu.: 97.0 1st Qu.:-15.00 1st Qu.: -5.00 1st Qu.:0.000 1st Qu.:0.000 2016-04-05:1452 1st Qu.: -3.500 1st Qu.:200.3 1st Qu.:-150.00 1st Qu.:-230.0 1st Qu.: -9.500 1st Qu.: 97.17 1st Qu.:100.50 1st Qu.:2016-04-08 Class :character Class :character 1st Qu.: 28.00 Class :character 1st Qu.: 29.00 Class :character 1st Qu.:10.07 Class :character san : 716 mia : 804 1st Qu.:-11.000 1st Qu.:2016-04-08 1st Qu.: -3.000 1st Qu.: 28.65 1st Qu.: 20.97 1st Qu.: -2.362 1st Qu.: -0.044 1st Qu.: 93.50 1st Qu.: 97.55 1st Qu.: -3.841 1st Qu.: -3.7944
## Mode :character Median : 99.00 Median :105.0 Median : -8.00 Median : 8.00 Median :0.000 Median :1.000 2016-04-08:1448 Median : 4.833 Median :205.5 Median : 7.50 Median :-127.5 Median : -4.833 Median :100.25 Median :104.50 Median :2016-04-11 Mode :character Mode :character Median : 53.00 Mode :character Median : 56.00 Mode :character Median :22.73 Mode :character okc : 671 dal : 687 Median : -3.000 Median :2016-04-11 Median : 3.000 Median : 52.64 Median : 47.36 Median : 1.821 Median : 0.033 Median : 99.58 Median :104.41 Median : 5.383 Median : 0.6389
## Mean : 99.71 Mean :105.3 Mean : -5.59 Mean : 5.59 Mean :0.353 Mean :0.647 2016-04-11:1233 Mean : 3.518 Mean :204.8 Mean : 26.41 Mean :-122.4 Mean : -3.518 Mean :100.66 Mean :104.18 Mean :2016-04-11 Mean : 52.85 Mean : 56.14 Mean :22.40 cle : 639 tor : 639 Mean : -3.293 Mean :2016-04-11 Mean : 3.293 Mean : 53.32 Mean : 46.68 Mean : 4.799 Mean : 31.508 Mean : 99.85 Mean :104.79 Mean : 4.938 Mean : 0.4491
## 3rd Qu.:107.00 3rd Qu.:113.0 3rd Qu.: 5.00 3rd Qu.: 15.00 3rd Qu.:1.000 3rd Qu.:1.000 2016-04-10:1168 3rd Qu.: 9.500 3rd Qu.:209.5 3rd Qu.: 190.00 3rd Qu.: 115.0 3rd Qu.: 3.500 3rd Qu.:103.50 3rd Qu.:108.75 3rd Qu.:2016-04-14 3rd Qu.: 77.00 3rd Qu.: 82.00 3rd Qu.:34.25 tor : 578 hou : 593 3rd Qu.: 3.000 3rd Qu.:2016-04-13 3rd Qu.: 11.000 3rd Qu.: 79.03 3rd Qu.: 71.35 3rd Qu.: 9.017 3rd Qu.: 0.244 3rd Qu.:106.12 3rd Qu.:111.73 3rd Qu.: 14.159 3rd Qu.: 5.0549
## Max. :131.00 Max. :144.0 Max. : 29.00 Max. : 38.00 Max. :1.000 Max. :1.000 2016-04-06:1108 Max. : 19.000 Max. :225.2 Max. : 541.67 Max. : 410.0 Max. : 13.000 Max. :115.75 Max. :119.12 Max. :2016-04-24 Max. :131.00 Max. :144.00 Max. :48.00 mem : 577 ind : 586 Max. : 33.000 Max. :2016-04-24 Max. : 44.000 Max. :100.00 Max. :100.00 Max. : 380.000 Max. : 15128.072 Max. :135.35 Max. :146.36 Max. : 47.362 Max. : 21.3115
## (Other) :4979 (Other):9240 (Other):9019
One improvement to the data set would involve normalizing all games to have the same number of data points per game. Some games that ran long ended up having a lot more samples.
DQ_check <- ddply(lrDF_filtered, c("key"), summarise,
N = length(key))
# order by N
DQ_check <- DQ_check[order(DQ_check$N),]
# plot
p <- plot_ly(
x = DQ_check$Key,
y = DQ_check$N,
type = "bar")
p
# Wanted to save out the dataset at this point. Analysis will branch into seperate work efforts for a Logistic/Linear model building
# Also drop some columns as we move on to next step !!
lrDF_final <- lrDF_filtered
lrDF_final$dateOrig <- NULL
lrDF_final$ts <- NULL
lrDF_final$teamlonga <- NULL
lrDF_final$teamlongb <- NULL
lrDF_final$timestring <- NULL
lrDF_final$gameid <- NULL
lrDF_final$teamaml <- NULL
lrDF_final$teambml <- NULL
lrDF_final$dfa <- NULL
lrDF_final$dateStr <- NULL
names(lrDF_final)
## [1] "key" "fscorea" "fscoreb" "fscorea_fscoreb" "fscoreb_fscorea" "away_win" "home_win" "teamaspread" "overunder" "teambspread" "teama_vegas_fscore" "teamb_vegas_fscore" "scorea" "scoreb" "timeleft" "teama" "teamb" "scorea_scoreb" "date" "scoreb_scorea" "pct_complete" "pct_left" "cf1" "cf2" "teama_adj_fscore" "teamb_adj_fscore" "pfscoreb_pfscorea"
head(lrDF_final)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win teamaspread overunder teambspread teama_vegas_fscore teamb_vegas_fscore scorea scoreb timeleft teama teamb scorea_scoreb date scoreb_scorea pct_complete pct_left cf1 cf2 teama_adj_fscore teamb_adj_fscore pfscoreb_pfscorea
## 1 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 42 52 25.20000 cha tor -10 2016-04-05 10 47.500000 52.50000 6.884284 0.14286204 93.58125 105.6813 12.100000
## 2 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 44 58 20.93333 cha tor -14 2016-04-05 14 56.388889 43.61111 10.569592 0.25452584 86.84792 102.5924 15.744444
## 3 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 19 28 35.13333 cha tor -9 2016-04-05 9 26.805556 73.19444 5.250891 0.08348447 90.91354 102.8413 11.927778
## 4 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 35 41 28.18333 cha tor -6 2016-04-05 6 41.284722 58.71528 3.906817 0.07411763 92.68776 101.0364 8.348611
## 5 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 40 52 25.50000 cha tor -12 2016-04-05 12 46.875000 53.12500 8.212631 0.16881814 92.19531 106.3203 14.125000
## 6 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 9 12 43.30000 cha tor -3 2016-04-05 3 9.791667 90.20833 1.577128 0.02120872 97.62969 104.2380 6.608333
write.csv(lrDF_final, file = "nba-datawrangle-lrDF.csv")